{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Parsing svmlight-formatted files with sklearn for Learning to Rank data"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from os.path import expanduser, join\n",
      "\n",
      "import numpy as np\n",
      "from sklearn.datasets import load_svmlight_file\n",
      "from sklearn.externals import joblib"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# On-disk cache rooted at the current directory, so a fold is only parsed once.\n",
      "memory = joblib.Memory(cachedir='.', mmap_mode='r')\n",
      "\n",
      "\n",
      "@memory.cache\n",
      "def load_fold(dataset, subset, fold_idx=1, dtype=np.float32):\n",
      "    \"\"\"Load one subset of one fold of a svmlight-formatted dataset.\n",
      "\n",
      "    The file read is ``~/data/<dataset>/Fold<fold_idx>/<subset>.txt``.\n",
      "\n",
      "    Parameters\n",
      "    ----------\n",
      "    dataset : str\n",
      "        Name of the dataset folder under ``~/data``.\n",
      "    subset : str\n",
      "        File name without the ``.txt`` extension (e.g. 'train').\n",
      "    fold_idx : int\n",
      "        Number substituted into the ``Fold%d`` folder name.\n",
      "    dtype : numpy dtype\n",
      "        Forwarded to ``load_svmlight_file``.\n",
      "\n",
      "    Returns\n",
      "    -------\n",
      "    (X, y, qid)\n",
      "        ``X`` is the feature matrix converted to a dense array with\n",
      "        ``toarray()``; ``y`` and ``qid`` are returned as produced by\n",
      "        ``load_svmlight_file(..., query_id=True)``.\n",
      "    \"\"\"\n",
      "    data_folder = expanduser('~/data')\n",
      "    filepath = join(data_folder, dataset, 'Fold%d' % fold_idx, subset + '.txt')\n",
      "    X, y, qid = load_svmlight_file(filepath, dtype=dtype, query_id=True)\n",
      "    # Densify the features so callers get a plain array for X.\n",
      "    return X.toarray(), y, qid"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Load the train / vali / test subsets of fold 1 of MSLR-WEB10K.\n",
      "X_train, y_train, qid_train = load_fold(dataset='MSLR-WEB10K', subset='train', fold_idx=1)\n",
      "X_vali, y_vali, qid_vali = load_fold(dataset='MSLR-WEB10K', subset='vali', fold_idx=1)\n",
      "X_test, y_test, qid_test = load_fold(dataset='MSLR-WEB10K', subset='test', fold_idx=1)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%time\n",
      "\n",
      "# Write all nine arrays of fold 1 into a single compressed .npz archive.\n",
      "np.savez_compressed(\n",
      "    expanduser('~/data/MSLR-WEB10K/mslr_web10k_fold1.npz'),\n",
      "    **{\n",
      "        'X_train': X_train, 'y_train': y_train, 'qid_train': qid_train,\n",
      "        'X_vali': X_vali, 'y_vali': y_vali, 'qid_vali': qid_vali,\n",
      "        'X_test': X_test, 'y_test': y_test, 'qid_test': qid_test,\n",
      "    }\n",
      ")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}